#!/usr/bin/env python
#coding:utf-8
"""
  Author:  程勇 --<>
  Purpose: Dongnao Academy (动脑学院), lesson 24 homework
  Created: 2018/3/22
"""

'''Scrape all images from the first three pages of http://tieba.baidu.com/p/2460150866'''

import urllib.request,socket,re,sys,os,html.parser as h

# Site root; relative links found in a page are appended to it.
base_url = "http://tieba.baidu.com"

# Directory (with trailing slash) that downloaded images are written to.
save_path = "./scrapy/"

# Sequence number used as the file name of the next image to be saved.
img_num = 1

# Request headers sent with every page fetch.
Webheader = {
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
}

def getContant(Weburl):
    """Download a page and return its body as text.

    Args:
        Weburl: Absolute URL of the page to fetch.

    Returns:
        The response body decoded with ``bytes.decode()``'s default
        codec (UTF-8).
    """
    req = urllib.request.Request(url=Weburl, headers=Webheader)
    # The context manager closes the response even when read() raises;
    # a trailing explicit close() call would be skipped in that case.
    with urllib.request.urlopen(req) as respose:
        _contant = respose.read()
    return _contant.decode()

def get_imgUrl(URL):
    """Fetch one thread page and extract its image URLs and next-page link.

    Args:
        URL: Absolute URL of the thread page to scan.

    Returns:
        A tuple ``(img_url, next_url, next_bool)``:
        ``img_url`` is the list of matched image URLs, ``next_url`` is the
        absolute URL of the next page (``""`` when there is none), and
        ``next_bool`` is ``"yes"`` or ``"no"`` depending on whether a
        next-page link was found.
    """
    contant = getContant(URL)

    # Match the image source URL. The extensions sit in their own
    # non-capturing group: written as `...\.jpg|png|gif` the alternation
    # would split the whole capture group, so only ".jpg" URLs could match.
    # `[^"]*?` keeps the match inside a single src attribute.
    img_re = re.compile(
        r'<img pic_type="0" class="BDE_Image" '
        r'src="(https://imgsa\.baidu\.com/[^"]*?\.(?:jpg|png|gif))" pic_ext="'
    )
    img_url = img_re.findall(contant)

    # Look for a "next page" (下一页) link. `[^"]*` stops at the closing
    # quote so an earlier <a href="..."> on the same line cannot be
    # swallowed into the captured href.
    next_re = re.compile(r'<a href="([^"]*)">下一页</a>')
    next_list = next_re.findall(contant)

    next_url = base_url + next_list[0] if next_list else ""
    next_bool = "yes" if next_list else "no"

    return img_url, next_url, next_bool
    
def save_img(img_url):
    """Download one image into ``save_path`` under a sequential file name.

    The file is named ``<img_num>.<extension>``; the module-level counter
    ``img_num`` is incremented after a successful download.

    Args:
        img_url: Absolute URL of the image to download.
    """
    global img_num
    # Take the extension from the end of the URL instead of a fixed dot
    # position, which breaks as soon as the URL has more or fewer dots.
    # Fall back to "jpg" if the URL carries no extension at all.
    extension = os.path.splitext(img_url)[1].lstrip('.') or 'jpg'
    img_path = save_path + '{}.{}'.format(img_num, extension)

    # Create the target directory (and any missing parents) up front rather
    # than reacting to a failed download.
    os.makedirs(save_path, exist_ok=True)
    urllib.request.urlretrieve(img_url, img_path)

    img_num += 1
    
def run(thread_path='/p/2460150866', max_pages=3):
    """Download the images from the first pages of a thread.

    Args:
        thread_path: Path of the thread relative to ``base_url``.
        max_pages: Maximum number of pages to scan.
    """
    next_url = base_url + thread_path

    # Stop after max_pages pages OR as soon as there is no next page.
    # Joining the two conditions with `or` would request an empty URL on
    # short threads and keep going past the limit on long ones.
    for _ in range(max_pages):
        img_urls, next_url, next_bool = get_imgUrl(next_url)

        for url in img_urls:
            save_img(url)

        if next_bool != 'yes':
            break
    
# Start the scraper only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    run()
    